package spark.pipeline

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{HashingTF, IDF, StopWordsRemover, Tokenizer}
import org.apache.spark.sql.SparkSession

/**
 * Small demo that runs three hard-coded sentences through a TF-IDF feature
 * chain: Tokenizer -> StopWordsRemover -> HashingTF -> IDF.
 *
 * The intermediate DataFrame of every stage is printed with `show(false)`
 * (i.e. without truncating column contents) so the effect of each step can be
 * inspected on the console.
 */
object HashingTFTest {

  /**
   * Entry point. Builds a single-threaded local SparkSession, runs the feature
   * chain and always releases the session afterwards.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("DataClear").getOrCreate()

    // Everything that uses the session lives inside `try` so that a failure in
    // any stage still reaches `spark.close()` in the `finally` clause.
    try {
      // Input: (label, sentence) rows.
      val sentenceData = spark.createDataFrame(Seq(
        (0, "I heard about Spark and I love Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic regression models are neat")
      )).toDF("label", "sentence")

      // Stage 1: "sentence" -> "words".
      val tokenizer = new Tokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)
      wordsData.show(false)

      // Stage 2: "words" -> "wordsRemover". No stop-word list is configured
      // here, so whatever default the library ships with is applied.
      val stopWordsRemover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("wordsRemover")
      val wordsRemover = stopWordsRemover.transform(wordsData)
      wordsRemover.show(false)

      // Stage 3: "wordsRemover" -> "rawFeatures", hashed into 2000 buckets.
      // NOTE(review): Spark's docs advise a power of two for numFeatures;
      // 2000 is kept as-is to preserve the existing output — confirm intent.
      val hashingTF = new HashingTF()
        .setInputCol("wordsRemover")
        .setOutputCol("rawFeatures")
        .setNumFeatures(2000)
      val featuredData = hashingTF.transform(wordsRemover)
      featuredData.show(false)

      // Stage 4: "rawFeatures" -> "features". IDF is an estimator, so it is
      // fitted on the same data it subsequently transforms.
      val idf = new IDF()
        .setInputCol("rawFeatures")
        .setOutputCol("features")
      val rescaledData = idf.fit(featuredData).transform(featuredData)
      rescaledData.show(false)
    } finally {
      spark.close()
    }
  }
}
